import pandas as pd
import numpy as np
import warnings
from pandas_profiling import ProfileReport
import sys
#visualization
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sktime.utils.plotting import plot_series, plot_lags, plot_correlations
#from visuals import *
#config to clean up the notebook
pd.set_option('display.max_columns', None)  # show every column when displaying wide frames
pd.options.display.float_format = '{:.2f}'.format  # render floats with 2 decimals in displayed output
warnings.filterwarnings('ignore')  # NOTE(review): silences ALL warnings globally, including deprecations
#read the training data and parse the dates into a datetime object
train_dtypes = {
    'store_nbr' : 'category',
    'family' : 'category',
    'sales' : 'float32',
}
df = pd.read_csv('train.csv',
                 usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
                 dtype=train_dtypes,
                 parse_dates=['date'],
                 infer_datetime_format=True)
#convert the date into a period
#df['date'] = df.date.dt.to_period('D')
#set a multiindex and sort to clean up the display of dataframe
df = df.set_index(['store_nbr', 'family', 'date']).sort_index()
df.head()
| sales | onpromotion | |||
|---|---|---|---|---|
| store_nbr | family | date | ||
| 1 | AUTOMOTIVE | 2013-01-01 | 0.00 | 0 |
| 2013-01-02 | 2.00 | 0 | ||
| 2013-01-03 | 3.00 | 0 | ||
| 2013-01-04 | 3.00 | 0 | ||
| 2013-01-05 | 5.00 | 0 |
#aggregate the stores daily sales by product family:
#mean across stores per (family, date), then pivot families into columns
family_means = df.groupby(['family', 'date']).mean()
df_family = family_means.unstack('family')
display(df_family.head())
df_family.shape
| sales | onpromotion | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| family | AUTOMOTIVE | BABY CARE | BEAUTY | BEVERAGES | BOOKS | BREAD/BAKERY | CELEBRATION | CLEANING | DAIRY | DELI | EGGS | FROZEN FOODS | GROCERY I | GROCERY II | HARDWARE | HOME AND KITCHEN I | HOME AND KITCHEN II | HOME APPLIANCES | HOME CARE | LADIESWEAR | LAWN AND GARDEN | LINGERIE | LIQUOR,WINE,BEER | MAGAZINES | MEATS | PERSONAL CARE | PET SUPPLIES | PLAYERS AND ELECTRONICS | POULTRY | PREPARED FOODS | PRODUCE | SCHOOL AND OFFICE SUPPLIES | SEAFOOD | AUTOMOTIVE | BABY CARE | BEAUTY | BEVERAGES | BOOKS | BREAD/BAKERY | CELEBRATION | CLEANING | DAIRY | DELI | EGGS | FROZEN FOODS | GROCERY I | GROCERY II | HARDWARE | HOME AND KITCHEN I | HOME AND KITCHEN II | HOME APPLIANCES | HOME CARE | LADIESWEAR | LAWN AND GARDEN | LINGERIE | LIQUOR,WINE,BEER | MAGAZINES | MEATS | PERSONAL CARE | PET SUPPLIES | PLAYERS AND ELECTRONICS | POULTRY | PREPARED FOODS | PRODUCE | SCHOOL AND OFFICE SUPPLIES | SEAFOOD |
| date | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 2013-01-01 | 0.00 | 0.00 | 0.04 | 15.00 | 0.00 | 3.34 | 0.00 | 3.44 | 2.65 | 1.32 | 0.85 | 0.55 | 12.96 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.04 | 0.09 | 1.94 | 0.00 | 2.05 | 0.46 | 0.00 | 0.00 | 0.79 | 0.70 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2013-01-02 | 4.72 | 0.00 | 3.83 | 1335.04 | 0.00 | 486.04 | 0.00 | 1382.02 | 432.98 | 291.75 | 202.44 | 131.77 | 3741.11 | 27.33 | 0.85 | 0.00 | 0.00 | 0.39 | 0.00 | 0.00 | 2.24 | 8.69 | 44.65 | 0.00 | 386.51 | 318.59 | 0.00 | 0.00 | 258.81 | 98.85 | 0.00 | 0.00 | 28.27 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2013-01-03 | 2.98 | 0.00 | 2.31 | 964.91 | 0.00 | 341.79 | 0.00 | 1035.06 | 333.35 | 206.90 | 136.26 | 88.16 | 2682.93 | 19.41 | 0.69 | 0.00 | 0.00 | 0.28 | 0.00 | 0.00 | 1.54 | 6.78 | 45.85 | 0.00 | 307.36 | 232.74 | 0.00 | 0.00 | 197.67 | 66.51 | 0.00 | 0.00 | 20.27 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2013-01-04 | 3.13 | 0.00 | 2.46 | 1003.09 | 0.00 | 309.67 | 0.00 | 964.15 | 336.07 | 187.84 | 125.19 | 83.81 | 2513.96 | 19.09 | 1.06 | 0.00 | 0.00 | 0.24 | 0.00 | 0.00 | 2.35 | 7.07 | 88.81 | 0.00 | 400.48 | 209.31 | 0.00 | 0.00 | 199.49 | 82.83 | 0.00 | 0.00 | 23.95 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2013-01-05 | 6.33 | 0.00 | 3.54 | 1441.07 | 0.00 | 414.22 | 0.00 | 1298.67 | 427.44 | 254.35 | 158.81 | 107.07 | 3488.07 | 23.57 | 1.61 | 0.00 | 0.00 | 0.20 | 0.00 | 0.00 | 3.33 | 8.48 | 124.35 | 0.00 | 386.65 | 311.46 | 0.00 | 0.00 | 249.54 | 107.96 | 0.00 | 0.00 | 23.07 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
(1684, 66)
#initial look at the data: one time-series plot per product family
for family in df_family.sales.columns:
    plot_series(df_family.sales[family])
We can see a few interesting things right away:
#total daily sales across all families
temp = df_family.sales.sum(axis=1)
#lets check what christmas day looks like in each year
is_christmas = np.logical_and(temp.index.month == 12, temp.index.day == 25)
display(temp.loc[is_christmas])
#lets check to see if there are any other missing dates
pd.date_range(start='2013-01-01', end='2017-08-15').difference(temp.index)
Series([], dtype: float32)
DatetimeIndex(['2013-12-25', '2014-12-25', '2015-12-25', '2016-12-25'], dtype='datetime64[ns]', freq=None)
#lets see where the big gap happens, using CELEBRATION as an example for issue 2
zero_rows = df_family.sales[df_family.sales['CELEBRATION'] == 0.00]
zero_rows.CELEBRATION.loc['2015']
2015-01-01 0.00
2015-01-02 0.00
2015-01-03 0.00
2015-01-04 0.00
2015-01-05 0.00
...
2015-05-27 0.00
2015-05-28 0.00
2015-05-29 0.00
2015-05-30 0.00
2015-05-31 0.00
Name: CELEBRATION, Length: 151, dtype: float32
# lets try plotting the series again starting on 2015-06-01
date_filter = df_family.sales.loc['2015-06-01' : '2017-08-15']
for family in date_filter.columns:
    plot_series(date_filter[family])
Next lets deal with the missing christmas dates. In prediction, we can just utilize a conditional zero-prediction strategy to replace the predictions our models might make, since we know that the stores will be closed on this day. To help our models out, however, lets impute each christmas date as the centered average of its four neighboring days (two before and two after). This will give our models a time series with no gaps, while causing minimal disruption to the signal.
#lets filter the entire dataframe by the previous date range
df_filtered = df_family.loc['2015-06-01' : '2017-08-15']
#add zero-filled records for the missing christmas days
#(built as float directly instead of NaN-object columns + fillna, so the
#concat keeps numeric dtypes)
idx = pd.to_datetime(['2015-12-25', '2016-12-25'], format='%Y/%m/%d')
new_rows = pd.DataFrame(0.00, columns=df_filtered.columns, index=idx)
#lets add them into the filtered dataframe and sort the index
df_filtered = pd.concat([df_filtered, new_rows], ignore_index=False).sort_index()
#impute each christmas value as the centered average of its 4 neighbours
#(t-2, t-1, t+1, t+2).
#BUGFIX: the original chained indexing (df_filtered.sales.loc[i][col] = ...)
#assigns into a temporary copy and may never write back to df_filtered
#(pandas SettingWithCopy); write through one .loc call with the full
#(level0, level1) column key instead.
for i in idx:
    neighbours = [i - pd.Timedelta(2, unit='days'), i - pd.Timedelta(1, unit='days'),
                  i + pd.Timedelta(1, unit='days'), i + pd.Timedelta(2, unit='days')]
    for col in df_filtered.sales.columns:
        for field in ('sales', 'onpromotion'):
            df_filtered.loc[i, (field, col)] = sum(
                df_filtered.loc[d, (field, col)] for d in neighbours
            ) / 4.00
#lets make the indexes for new years day
idx = pd.to_datetime(['2016-01-01', '2017-01-01'], format='%Y/%m/%d')
#impute each new years value as the centered average of its 4 neighbours
#(t-2, t-1, t+1, t+2).
#BUGFIX: the original chained indexing (df_filtered.sales.loc[i][col] = ...)
#assigns into a temporary copy and may never write back to df_filtered
#(pandas SettingWithCopy); write through one .loc call with the full
#(level0, level1) column key instead.
for i in idx:
    neighbours = [i - pd.Timedelta(2, unit='days'), i - pd.Timedelta(1, unit='days'),
                  i + pd.Timedelta(1, unit='days'), i + pd.Timedelta(2, unit='days')]
    for col in df_filtered.sales.columns:
        for field in ('sales', 'onpromotion'):
            df_filtered.loc[i, (field, col)] = sum(
                df_filtered.loc[d, (field, col)] for d in neighbours
            ) / 4.00
#now lets plot those time plots again
for family in df_filtered.sales.columns:
    plot_series(df_filtered.sales[family])
Lets see if we can take a look at some of these outliers and figure out if there is anything we should do about them
#the meats family has a very obvious single value outlier point, lets look at it
meats = df_filtered['sales']['MEATS']
df_filtered['sales'].loc[meats > 1000]
#its on 2016-10-07 -- check the promotions around that date
df_filtered['onpromotion']['MEATS'].loc['2016-10-05' : '2016-10-09']
2016-10-05 0.04 2016-10-06 8.87 2016-10-07 21.80 2016-10-08 0.02 2016-10-09 0.00 Name: MEATS, dtype: float64
#BABY CARE has another one of these lets take a look
baby_care = df_filtered['sales']['BABY CARE']
df_filtered['sales'].loc[baby_care > 1.5]
#its on 2015-08-04 -- show promotions and sales around that date
display(df_filtered['onpromotion']['BABY CARE'].loc['2015-08-02' : '2015-08-06'])
baby_care.loc['2015-08-02' : '2015-08-06']
2015-08-02 0.00 2015-08-03 0.00 2015-08-04 0.00 2015-08-05 0.00 2015-08-06 0.00 Name: BABY CARE, dtype: float64
2015-08-02 0.28 2015-08-03 0.15 2015-08-04 2.31 2015-08-05 0.19 2015-08-06 0.28 Name: BABY CARE, dtype: float64
There are many ways we could try to deal with this but I will choose to change 2.31 to 0.31 to preserve the peak of this day somewhat with a lesser impact.
#change the value of BABY CARE on 2015-08-04 to remove the outlier.
#BUGFIX: the original chained assignment
#(df_filtered.sales['BABY CARE'].loc['2015-08-04'] = 0.31) writes into an
#intermediate object and may not modify df_filtered at all (pandas
#SettingWithCopy); assign through a single .loc with the full column key.
df_filtered.loc['2015-08-04', ('sales', 'BABY CARE')] = 0.31
plot_series(df_filtered.sales['BABY CARE'])
(<Figure size 1600x400 with 1 Axes>, <AxesSubplot: ylabel='BABY CARE'>)
#read the per-store transactions data
txn_dtypes = {
    'store_nbr' : 'category',
    'transactions' : 'int'
}
transactions = pd.read_csv('transactions.csv',
                           usecols=['date', 'store_nbr', 'transactions'],
                           dtype=txn_dtypes,
                           parse_dates=['date'],
                           infer_datetime_format=True)
transactions = transactions.set_index(['store_nbr', 'date']).sort_index()
#daily transaction totals, pivoted to one column per store
df_transactions = (
    transactions.groupby(['store_nbr', 'date']).sum().unstack('store_nbr')
)
df_transactions.head()
| transactions | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| store_nbr | 1 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 2 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 3 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 4 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 5 | 50 | 51 | 52 | 53 | 54 | 6 | 7 | 8 | 9 |
| date | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 2013-01-01 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 770 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2013-01-02 | 2111 | 1293 | 3547 | 1362 | 1102 | 2002 | 1622 | 1167 | 1580 | 1635 | 1369 | 2358 | 0 | 0 | 0 | 1381 | 2605 | 1038 | 1008 | 1386 | 950 | 0 | 3487 | 708 | 1401 | 776 | 1163 | 2724 | 786 | 0 | 1691 | 1987 | 1587 | 1922 | 1178 | 915 | 0 | 1385 | 4821 | 4208 | 4886 | 4161 | 3397 | 2346 | 1903 | 3077 | 1985 | 0 | 0 | 998 | 2143 | 1874 | 3250 | 2940 |
| 2013-01-03 | 1833 | 1157 | 2675 | 1248 | 916 | 1823 | 1512 | 1101 | 1339 | 1484 | 1098 | 2033 | 0 | 0 | 0 | 1181 | 2462 | 887 | 813 | 928 | 749 | 0 | 3026 | 689 | 1126 | 727 | 1168 | 2348 | 725 | 0 | 1526 | 1746 | 1426 | 1551 | 1171 | 809 | 0 | 1334 | 3618 | 3314 | 3438 | 3660 | 2887 | 1702 | 1740 | 2307 | 1644 | 0 | 0 | 920 | 1795 | 1568 | 2904 | 2396 |
| 2013-01-04 | 1863 | 970 | 2515 | 1064 | 879 | 1641 | 1318 | 977 | 1139 | 1238 | 1003 | 2066 | 0 | 0 | 0 | 1257 | 2607 | 1054 | 813 | 1034 | 651 | 0 | 3188 | 689 | 1246 | 680 | 948 | 2369 | 630 | 0 | 1701 | 1723 | 1285 | 1596 | 1044 | 835 | 0 | 1358 | 4169 | 3630 | 3434 | 3915 | 2900 | 2016 | 1642 | 2698 | 1786 | 0 | 0 | 794 | 1679 | 1513 | 2962 | 1975 |
| 2013-01-05 | 1509 | 1269 | 3052 | 1293 | 1336 | 2052 | 1543 | 1109 | 1433 | 1323 | 1713 | 2062 | 0 | 0 | 0 | 1250 | 2578 | 1355 | 1081 | 1327 | 1033 | 0 | 3623 | 749 | 1317 | 890 | 1217 | 2599 | 723 | 0 | 1770 | 2214 | 1851 | 1825 | 1118 | 1021 | 0 | 1205 | 4921 | 4331 | 4935 | 4764 | 4084 | 2562 | 1643 | 3459 | 2068 | 0 | 0 | 949 | 2154 | 1599 | 3060 | 2604 |
Lets take a look at that!
#one time-series plot per store
for store in df_transactions.columns:
    plot_series(df_transactions[store])
Lets take a closer look at the exact dates we have selected in the training set
#restrict transactions to the training window used above
df_transactions_filtered = df_transactions.loc['2015-06-01' : '2017-08-15']
#a hand-picked subset of stores to look at more closely
labels = ['20', '21', '22', '29', '42', '52', '53']
for store in labels:
    fig, ax = plot_series(df_transactions_filtered.transactions[store])
Lets look at the oil price data next
#read the daily WTI oil price data
df_oil = pd.read_csv('oil.csv',
                     usecols=['date', 'dcoilwtico'],
                     dtype={'dcoilwtico' : 'float32'},
                     parse_dates=['date'],
                     infer_datetime_format=True)
df_oil = df_oil.set_index(['date']).sort_index()
display(df_oil.info())
#how many calendar days in the training window have no price record?
missing = pd.date_range(start='2013-01-01', end='2017-08-15').difference(df_oil.index)
display(missing.shape)
plot_series(df_oil)
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 1218 entries, 2013-01-01 to 2017-08-31 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 dcoilwtico 1175 non-null float32 dtypes: float32(1) memory usage: 14.3 KB
None
(482,)
(<Figure size 1600x400 with 1 Axes>, <AxesSubplot: >)
#add rows for missing dates
#NOTE(review): resample('D').sum() fills the inserted days with 0 rather than NaN,
#which is why the zeros are converted back to NaN two lines down -- a roundabout
#but working route to a complete daily index (assumes a real price is never 0.0)
df_oil = df_oil.dcoilwtico.resample('D').sum().reset_index()
#interpolate the missing prices
#NOTE(review): this rename only matters if the value column came out labeled 0;
#if reset_index already named it 'dcoilwtico' this is a no-op -- verify at runtime
df_oil.rename(columns={0 : 'dcoilwtico'}, inplace=True)
#turn the artificial zeros from the resample back into NaN so they can be interpolated
df_oil['dcoilwtico'] = np.where(df_oil['dcoilwtico'] == 0, np.nan, df_oil['dcoilwtico'])
#linear interpolation (default method) across the NaN gaps
df_oil_interp = df_oil.dcoilwtico.interpolate()
#restore the datetime index; assumes the reset index column is named 'index' -- TODO confirm
df_oil.set_index('index', inplace=True)
df_oil_interp = pd.DataFrame(index=df_oil.index, data=df_oil_interp.values, columns=['dcoilwtico'])
fig, ax = plot_series(df_oil, title='original data')
fig, ax = plot_series(df_oil_interp, title='interpolated data')
df_oil_interp.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 1704 entries, 2013-01-01 to 2017-08-31 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 dcoilwtico 1703 non-null float32 dtypes: float32(1) memory usage: 20.0 KB
Lets take a look at the holidays data
#read the holidays/events data
holiday_dtypes = {
    'type' : 'category',
    'locale' : 'category',
    'transferred' : 'bool'
}
holidays = pd.read_csv('holidays_events.csv',
                       usecols=['date', 'type', 'locale', 'transferred'],
                       dtype=holiday_dtypes,
                       parse_dates=['date'],
                       infer_datetime_format=True)
holidays = holidays.set_index(['date']).sort_index()
display(holidays.info())
display(holidays.value_counts())
#restrict to the training window used above
df_holidays = holidays.loc['2015-06-01' : '2017-08-15']
df_holidays.shape
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 350 entries, 2012-03-02 to 2017-12-26 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 type 350 non-null category 1 locale 350 non-null category 2 transferred 350 non-null bool dtypes: bool(1), category(2) memory usage: 4.1 KB
None
type locale transferred Holiday Local False 133 Event National False 56 Holiday National False 52 Additional National False 40 Holiday Regional False 24 Additional Local False 11 Holiday National True 8 Transfer National False 8 Bridge National False 5 Work Day National False 5 Holiday Local True 4 Transfer Local False 4 dtype: int64
(152, 3)
Now that we have checked and done some basic cleaning of the dataset, we can move on to some exploratory analysis to inform the building of our prediction model
A time series is said to be stationary if its mean, variance and covariance do not vary with time. This property is a requirement for some forms of statistical modeling. One way to determine this is through the use of two different statistical tests: the Augmented Dickey-Fuller test and the KPSS test.
#lets write a function to perform and display an Augmented Dickey Fuller test
#And a KPSS test to check for stationarity
from statsmodels.tsa.stattools import adfuller, kpss
def adf_kpss_test(ts):
    """Print ADF and KPSS stationarity test results for a time series.

    ADF H0: the series has a unit root (non-stationary).
    KPSS H0: the series is trend stationary.

    Parameters
    ----------
    ts : array-like or pd.Series
        The time series to test.
    """
    #run each test once and reuse the tuple -- the original called
    #adfuller(ts) and kpss(ts) twice per series, doubling the work
    adf_result = adfuller(ts)
    print('Augmented Dickey-Fuller: ')
    adftest = pd.Series(
        adf_result[0:4],
        index=['Test Statistic', 'p-value', 'Num Lags', 'Num Obs']
    )
    print(adftest)
    for key, value in adf_result[4].items():
        print('Critical Value {} : {}'.format(key, value))
    kpss_result = kpss(ts)
    print("\nKPSS: ")
    kpsstest = pd.Series(
        kpss_result[0:3], index=['Test Statistic', 'p-value', 'Num Lags']
    )
    print(kpsstest)
    for key, value in kpss_result[3].items():
        print('Critical Value {} : {}'.format(key, value))
#run the stationarity tests on every family series
#NOTE(review): df_analysis is not defined in this chunk -- presumably built in an earlier cell
for family in df_analysis.sales.columns:
    print('TESTING {}'.format(family))
    adf_kpss_test(df_analysis.sales[family])
    print('\n')
TESTING AUTOMOTIVE Augmented Dickey-Fuller: Test Statistic -4.20 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.47 p-value 0.05 Num Lags 12.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING BABY CARE Augmented Dickey-Fuller: Test Statistic -3.03 p-value 0.03 Num Lags 13.00 Num Obs 793.00 dtype: float64 Critical Value 1% : -3.438623132449471 Critical Value 5% : -2.8651915799370014 Critical Value 10% : -2.568714441670417 KPSS: Test Statistic 1.26 p-value 0.01 Num Lags 15.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING BEAUTY Augmented Dickey-Fuller: Test Statistic -3.24 p-value 0.02 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 3.16 p-value 0.01 Num Lags 9.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING BEVERAGES Augmented Dickey-Fuller: Test Statistic -3.33 p-value 0.01 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 1.65 p-value 0.01 Num Lags 16.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING BOOKS Augmented Dickey-Fuller: Test Statistic -2.47 p-value 0.12 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 1.00 p-value 0.01 Num Lags 17.00 dtype: float64 Critical Value 10% : 0.347 
Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING BREAD/BAKERY Augmented Dickey-Fuller: Test Statistic -3.33 p-value 0.01 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.83 p-value 0.01 Num Lags 68.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING CELEBRATION Augmented Dickey-Fuller: Test Statistic -5.36 p-value 0.00 Num Lags 16.00 Num Obs 790.00 dtype: float64 Critical Value 1% : -3.4386546523763837 Critical Value 5% : -2.865205472974755 Critical Value 10% : -2.568721842653421 KPSS: Test Statistic 0.35 p-value 0.10 Num Lags 19.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING CLEANING Augmented Dickey-Fuller: Test Statistic -5.18 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 1.45 p-value 0.01 Num Lags 4.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING DAIRY Augmented Dickey-Fuller: Test Statistic -2.88 p-value 0.05 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 3.56 p-value 0.01 Num Lags 10.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING DELI Augmented Dickey-Fuller: Test Statistic -4.67 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test 
Statistic 0.60 p-value 0.02 Num Lags 32.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING EGGS Augmented Dickey-Fuller: Test Statistic -2.75 p-value 0.07 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.67 p-value 0.02 Num Lags 101.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING FROZEN FOODS Augmented Dickey-Fuller: Test Statistic -5.76 p-value 0.00 Num Lags 10.00 Num Obs 796.00 dtype: float64 Critical Value 1% : -3.4385918508847646 Critical Value 5% : -2.865177791819505 Critical Value 10% : -2.568707096601096 KPSS: Test Statistic 0.07 p-value 0.10 Num Lags 17.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING GROCERY I Augmented Dickey-Fuller: Test Statistic -4.71 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 1.57 p-value 0.01 Num Lags 9.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING GROCERY II Augmented Dickey-Fuller: Test Statistic -4.00 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.49 p-value 0.04 Num Lags 15.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING HARDWARE Augmented Dickey-Fuller: Test Statistic -3.33 p-value 0.01 Num Lags 20.00 Num Obs 786.00 dtype: float64 Critical Value 1% : -3.438697054464251 
Critical Value 5% : -2.8652241623148678 Critical Value 10% : -2.5687317987167284 KPSS: Test Statistic 1.04 p-value 0.01 Num Lags 9.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING HOME AND KITCHEN I Augmented Dickey-Fuller: Test Statistic -3.75 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.41 p-value 0.07 Num Lags 15.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING HOME AND KITCHEN II Augmented Dickey-Fuller: Test Statistic -5.45 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.24 p-value 0.10 Num Lags 14.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING HOME APPLIANCES Augmented Dickey-Fuller: Test Statistic -2.61 p-value 0.09 Num Lags 20.00 Num Obs 786.00 dtype: float64 Critical Value 1% : -3.438697054464251 Critical Value 5% : -2.8652241623148678 Critical Value 10% : -2.5687317987167284 KPSS: Test Statistic 0.59 p-value 0.02 Num Lags 17.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING HOME CARE Augmented Dickey-Fuller: Test Statistic -6.69 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.76 p-value 0.01 Num Lags 12.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING LADIESWEAR Augmented Dickey-Fuller: Test 
Statistic -4.82 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.14 p-value 0.10 Num Lags 78.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING LAWN AND GARDEN Augmented Dickey-Fuller: Test Statistic -1.44 p-value 0.56 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 3.54 p-value 0.01 Num Lags 15.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING LINGERIE Augmented Dickey-Fuller: Test Statistic -2.89 p-value 0.05 Num Lags 20.00 Num Obs 786.00 dtype: float64 Critical Value 1% : -3.438697054464251 Critical Value 5% : -2.8652241623148678 Critical Value 10% : -2.5687317987167284 KPSS: Test Statistic 0.71 p-value 0.01 Num Lags 15.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING LIQUOR,WINE,BEER Augmented Dickey-Fuller: Test Statistic -5.15 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.15 p-value 0.10 Num Lags 2.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING MAGAZINES Augmented Dickey-Fuller: Test Statistic -3.58 p-value 0.01 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 1.83 p-value 0.01 Num Lags 15.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 
Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING MEATS Augmented Dickey-Fuller: Test Statistic -5.53 p-value 0.00 Num Lags 20.00 Num Obs 786.00 dtype: float64 Critical Value 1% : -3.438697054464251 Critical Value 5% : -2.8652241623148678 Critical Value 10% : -2.5687317987167284 KPSS: Test Statistic 0.83 p-value 0.01 Num Lags 54.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING PERSONAL CARE Augmented Dickey-Fuller: Test Statistic -5.81 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.47 p-value 0.05 Num Lags 3.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING PET SUPPLIES Augmented Dickey-Fuller: Test Statistic -1.49 p-value 0.54 Num Lags 20.00 Num Obs 786.00 dtype: float64 Critical Value 1% : -3.438697054464251 Critical Value 5% : -2.8652241623148678 Critical Value 10% : -2.5687317987167284 KPSS: Test Statistic 5.04 p-value 0.01 Num Lags 12.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING PLAYERS AND ELECTRONICS Augmented Dickey-Fuller: Test Statistic -3.72 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 1.33 p-value 0.01 Num Lags 11.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING POULTRY Augmented Dickey-Fuller: Test Statistic -5.17 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 
0.10 p-value 0.10 Num Lags 242.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING PREPARED FOODS Augmented Dickey-Fuller: Test Statistic -1.61 p-value 0.48 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 1.07 p-value 0.01 Num Lags 12.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING PRODUCE Augmented Dickey-Fuller: Test Statistic -3.79 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.77 p-value 0.01 Num Lags 15.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING SCHOOL AND OFFICE SUPPLIES Augmented Dickey-Fuller: Test Statistic -2.90 p-value 0.05 Num Lags 17.00 Num Obs 789.00 dtype: float64 Critical Value 1% : -3.4386652124595614 Critical Value 5% : -2.865210127510208 Critical Value 10% : -2.5687243221835088 KPSS: Test Statistic 0.43 p-value 0.06 Num Lags 17.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739 TESTING SEAFOOD Augmented Dickey-Fuller: Test Statistic -4.63 p-value 0.00 Num Lags 21.00 Num Obs 785.00 dtype: float64 Critical Value 1% : -3.438707722728768 Critical Value 5% : -2.8652288644673125 Critical Value 10% : -2.568734303622865 KPSS: Test Statistic 0.62 p-value 0.02 Num Lags 36.00 dtype: float64 Critical Value 10% : 0.347 Critical Value 5% : 0.463 Critical Value 2.5% : 0.574 Critical Value 1% : 0.739
For the Augmented Dickey-Fuller, the test is as follows:
H0 : The series has a unit root (Non-Stationary)
HA : The series has no unit root (Stationary)
For KPSS, the test is as follows:
H0 : The process is trend stationary (Stationary)
HA : The series has a unit root (Non-Stationary)
For non-stationary data, combining the results of both tests can potentially tell us more about how we could make the series stationary.
Strict Stationary: both tests claim series is stationary
Non Stationary: both tests claim series is not stationary
Trend Stationary: KPSS claims stationary, ADF claims non-stationary
Difference Stationary: ADF claims stationary, KPSS claims non-stationary
To make a trend stationary series strict stationary, one must remove the trend
To make a difference stationary series strict stationary, one must apply differencing
AUTOMOTIVE : Difference Stationary
BABY CARE : Difference Stationary
BEAUTY : Difference Stationary
BEVERAGES : Difference Stationary
BOOKS : Non Stationary
BREAD/BAKERY : Difference Stationary
CELEBRATION : Stationary
CLEANING : Difference Stationary
DAIRY : Difference Stationary
DELI : Difference Stationary
EGGS : Non Stationary
FROZEN FOODS : Trend Stationary
GROCERY I : Difference Stationary
GROCERY II : Difference Stationary
HARDWARE : Difference Stationary
HOME KITCHEN I : Stationary
HOME KITCHEN II: Stationary
HOME APPLIANCES: Non Stationary
HOME CARE : Difference Stationary
LADIESWEAR : Stationary
LAWN AND GARDEN: Non Stationary
LINGERIE : Difference Stationary
LIQUOR,WINEBEER: Stationary
MAGAZINES : Difference Stationary
MEATS : Difference Stationary
PERSONAL CARE : Difference Stationary
PET SUPPLIES : Non Stationary
PLAYERS ELECTRO: Difference Stationary
POULTRY : Stationary
PREPARED FOODS : Non Stationary
PRODUCE : Difference Stationary
SCHOOL AND OFFI: Stationary
SEAFOOD : Difference Stationary
We can see that most of our data is not stationary, although it could be made stationary by applying some form of differencing. It is also important to note that these tests will not capture every form of stationarity, and visual inspection should be used as well to sanity test.
from sktime.transformations.series.detrend import STLTransformer

def decompose_analysis(ts, return_components=True, robust=True, sp=28):
    """Run an STL decomposition on ``ts`` and plot diagnostic figures.

    Fits an ``STLTransformer`` with seasonal period ``sp``, overlays the
    extracted trend on the raw series, draws correlograms for the trend,
    seasonal, and residual components, and prints the residual mean
    (which should be close to 0 for a well-behaved decomposition).

    Parameters
    ----------
    ts : pd.Series
        The time series to decompose.
    return_components : bool, default True
        Passed through to STLTransformer; must be True for the component
        columns ('trend', 'seasonal', 'resid') accessed below to exist.
    robust : bool, default True
        Use the outlier-robust STL fit.
    sp : int, default 28
        Seasonal period; a multiple of the weekly (7-day) seasonality.
    """
    stl = STLTransformer(return_components=return_components, robust=robust, sp=sp)
    components = stl.fit_transform(ts)
    # Raw series with the fitted trend overlaid.
    plot_series(ts, components['trend'])
    # Correlograms for each extracted component, in the same order
    # the notebook's discussion walks through them.
    for part in ('trend', 'seasonal', 'resid'):
        plot_correlations(components[part])
    print(components['resid'].mean())
# Run the STL diagnostics for every product family's sales series.
for family in df_analysis.sales:
    print(f'DECOMPOSING ---- {family}')
    decompose_analysis(df_analysis.sales[family])
DECOMPOSING ---- AUTOMOTIVE 0.1694469872049522 DECOMPOSING ---- BABY CARE 0.014248868471554493 DECOMPOSING ---- BEAUTY 0.10300837116581618 DECOMPOSING ---- BEVERAGES 62.41388865517964 DECOMPOSING ---- BOOKS 0.018398195444936673 DECOMPOSING ---- BREAD/BAKERY 2.3725367541702544 DECOMPOSING ---- CELEBRATION 0.3523701644625468 DECOMPOSING ---- CLEANING 29.146322486611677 DECOMPOSING ---- DAIRY 12.920540236478452 DECOMPOSING ---- DELI 5.895242367560568 DECOMPOSING ---- EGGS 2.0808543642764654 DECOMPOSING ---- FROZEN FOODS 40.04585252804542 DECOMPOSING ---- GROCERY I 161.10259016897734 DECOMPOSING ---- GROCERY II 1.2771208269010688 DECOMPOSING ---- HARDWARE 0.011964410274961963 DECOMPOSING ---- HOME AND KITCHEN I 1.5511170581598746 DECOMPOSING ---- HOME AND KITCHEN II 1.4481383201466487 DECOMPOSING ---- HOME APPLIANCES 0.01814486139248679 DECOMPOSING ---- HOME CARE 8.409644052526994 DECOMPOSING ---- LADIESWEAR 0.1865585924187831 DECOMPOSING ---- LAWN AND GARDEN 0.5294507390330708 DECOMPOSING ---- LINGERIE 0.11577442597072785 DECOMPOSING ---- LIQUOR,WINE,BEER 9.554367934283515 DECOMPOSING ---- MAGAZINES 0.2810436147822098 DECOMPOSING ---- MEATS 3.3120145718087026 DECOMPOSING ---- PERSONAL CARE 12.099078614123242 DECOMPOSING ---- PET SUPPLIES 0.10240500457406766 DECOMPOSING ---- PLAYERS AND ELECTRONICS 0.408304135650565 DECOMPOSING ---- POULTRY 6.307179151658991 DECOMPOSING ---- PREPARED FOODS 1.0647179361095442 DECOMPOSING ---- PRODUCE 29.903122434799926 DECOMPOSING ---- SCHOOL AND OFFICE SUPPLIES 0.8726407831212354 DECOMPOSING ---- SEAFOOD 0.09352993684983515
When we ran the STL decomposition, we used a period of 28 (the `sp=28` default in the code above). This is a multiple of the strongest observed seasonality (weekly) that captures a certain smoothness of linear trend, or put another way, allows the capture of a certain amount of signal by the seasonality component. The higher the period, the smoother the trend line will be, which will allow for more variance to be captured by the seasonal component.
We can see that this setting worked quite well for most of the product families, by examining the correlograms of the different components.
For trend correlograms we should see ACF plots that slowly decrease over time and PACF plots that have a strong value at lag 1 and then drop close to 0 after that. This is indicative that there is little to no seasonality present in the data.
For seasonality correlograms we should see strong representations in the ACF at the strongest lag values and their multiples, with little to no decay. In the PACF plot for seasonal components we should see strong representations at the strongest lag values and their multiples with decay as the multiples increase. Especially the ACF part of this shows little to no linear trend present in the seasonal component.
For residual correlograms we should see neither of the things previously mentioned that describe trend and seasonal components. We should see an ACF and PACF with little to no strength at any lag beyond maybe the 1st lag.
A very good example of this is the SEAFOOD family plots directly above this cell.
Lag 7 and its multiples show high ACF throughout most of the families, as do lags 3, 4, 10, 11, 17, 18, and 24.
By examining these plots we can determine where the strong seasonalities lie that can give us some indication of the types of features we would want to use as inputs to our predictive model. We can also just get a good overall idea of what the different components in these time series look like.
We could also use this method of decomposition to produce forecasts, and then pass the residuals on to a different model such as a GBM or Neural Network, combining these forecasts together in a hybrid model
# Persist the cleaned dataframe to CSV so the modelling notebook can load it
# without repeating this analysis.
df_analysis.to_csv('sales_clean.csv')